{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "tired-hands",
   "metadata": {},
   "source": [
    "# Transforms image folder structure\n",
    "Using an image source folder and a metadata file, rearrange the folder structure so that each subfolder is a category containing the images of that category."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "generous-grammar",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (unlike !pip3) installs into the environment of the running kernel.\n",
    "%pip install pandas==1.2.1 wget==3.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "independent-blind",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import wget\n",
    "\n",
    "# claimed_utils.py provides the unzip/zipdir helpers used by this notebook.\n",
    "# Download it only when it is missing: wget.download() would otherwise save\n",
    "# a renamed duplicate next to the existing file on every re-run.\n",
    "if not os.path.exists('claimed_utils.py'):\n",
    "    wget.download(\n",
    "        'https://raw.githubusercontent.com/'\n",
    "        'elyra-ai/component-library/master/claimed_utils.py'\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "sorted-carbon",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from claimed_utils import unzip, zipdir\n",
    "import pandas as pd\n",
    "from shutil import copyfile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "flexible-shame",
   "metadata": {},
   "outputs": [],
   "source": [
    "# @dependency codait_utils.ipynb\n",
    "# @param metadata csv file containing metadata on the images\n",
    "# @param source_folder containing the images\n",
    "# @param target_column name on the metadata containing the target (class) value\n",
    "# @param image_column name on the metadata containing\n",
    "# the file name to the image\n",
    "# @param output_folder name of the folder where to put the images to\n",
    "# @param images_zip file name containing the original images\n",
    "# @param data_zip file name where the images are stored into\n",
    "# @param data folder name\n",
    "# @returns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "united-bouquet",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Component parameters: each one is read from the environment variable of\n",
    "# the same name and falls back to the default given here when it is unset.\n",
    "metadata = os.getenv('metadata', 'metadata.csv')\n",
    "source_folder = os.getenv('source_folder', 'images')\n",
    "target_column = os.getenv('target_column', 'finding')\n",
    "image_column = os.getenv('image_column', 'filename')\n",
    "output_folder = os.getenv('output_folder', 'data')\n",
    "images_zip = os.getenv('images_zip', 'images.zip')\n",
    "data_zip = os.getenv('data_zip', 'data.zip')\n",
    "data = os.getenv('data', 'data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bridal-pilot",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unpack the image archive with the unzip helper imported from claimed_utils.\n",
    "# NOTE(review): presumably extracts images_zip into the current directory so\n",
    "# that source_folder exists afterwards - confirm against unzip()'s signature.\n",
    "unzip('.', images_zip)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "tested-function",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `metadata` holds the CSV path until this cell rebinds it to the loaded\n",
    "# DataFrame; the type check keeps the cell safe to run more than once.\n",
    "if isinstance(metadata, str):\n",
    "    metadata = pd.read_csv(metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "heated-arkansas",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output root; exist_ok keeps a re-run from failing when the\n",
    "# folder is already there.\n",
    "os.makedirs(output_folder, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "drawn-equation",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct class labels found in the target column (one folder per label).\n",
    "folders = metadata.loc[:, target_column].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "separate-framing",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One subfolder per class. makedirs also creates intermediate directories\n",
    "# (a class name may contain '/') and tolerates folders left by an earlier run.\n",
    "for folder in folders:\n",
    "    os.makedirs(os.path.join(output_folder, folder), exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "korean-shield",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy every image listed in the metadata into the folder of its class.\n",
    "# Only two columns are needed, so iterate over them directly instead of\n",
    "# building a full row Series per image with iterrows().\n",
    "for file_name, class_name in zip(\n",
    "    metadata[image_column], metadata[target_column]\n",
    "):\n",
    "    copyfile(\n",
    "        os.path.join(source_folder, file_name),\n",
    "        os.path.join(output_folder, class_name, file_name),\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "realistic-listening",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the generated layout; IPython expands {output_folder}, so the listing\n",
    "# follows the configured folder instead of a fixed 'data'.\n",
    "!tree {output_folder}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bound-wells",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Archive the result with the zipdir helper imported from claimed_utils.\n",
    "# NOTE(review): this passes `data`, whereas the images were copied into\n",
    "# `output_folder`; both default to 'data' - confirm they are meant to differ.\n",
    "zipdir(data_zip, data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
